1. Go through "R for Beginners" if you are not familiar with R programming.
2. Use knitr to produce at least 3 examples (texts, figures, tables).
# Example 1 (text output): five-number summaries plus the mean for every column of mtcars.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
Make a summary of the data set mtcars.
mpg means Miles/(US) gallon; cyl means number of cylinders.
# Example (table): render the first seven columns of mtcars as a Markdown table.
knitr::kable(mtcars[,1:7])
| | mpg | cyl | disp | hp | drat | wt | qsec |
|---|---|---|---|---|---|---|---|
| Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 |
| Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 |
| Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 |
| Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 |
| Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 |
| Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 |
| Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 |
| Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 |
| Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 |
| Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 |
| Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 |
| Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 |
| Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 |
| Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 |
| Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 |
| Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 |
| Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 |
| Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 |
| Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 |
| Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 |
| Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 |
| Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 |
| AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 |
| Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 |
| Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 |
| Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 |
| Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 |
| Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 |
| Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 |
| Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 |
| Maserati Bora | 15.0 | 8 | 301.0 | 335 | 3.54 | 3.570 | 14.60 |
| Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 |
Generated a table of the first seven columns of the data set of mtcars
# Example (figure): distribution of fuel economy (mpg) grouped by cylinder count.
boxplot(mpg ~ cyl,data = mtcars,
main="Car Mileage Data",
xlab = "Number of cylinders",
ylab = "Miles/(US) gallon")
We can conclude that the six-cylinder models have a more even miles-per-gallon distribution than the other two groups. Compared with the six- and eight-cylinder models, the four-cylinder models show the widest spread of miles per gallon.
# Keep five columns of the built-in state.x77 matrix and convert them to a data frame.
states <- as.data.frame(state.x77[,c("Population","Murder","Illiteracy","Income","Frost")])
summary(states)
## Population Murder Illiteracy Income
## Min. : 365 Min. : 1.400 Min. :0.500 Min. :3098
## 1st Qu.: 1080 1st Qu.: 4.350 1st Qu.:0.625 1st Qu.:3993
## Median : 2838 Median : 6.850 Median :0.950 Median :4519
## Mean : 4246 Mean : 7.378 Mean :1.170 Mean :4436
## 3rd Qu.: 4968 3rd Qu.:10.675 3rd Qu.:1.575 3rd Qu.:4814
## Max. :21198 Max. :15.100 Max. :2.800 Max. :6315
## Frost
## Min. : 0.00
## 1st Qu.: 66.25
## Median :114.50
## Mean :104.46
## 3rd Qu.:139.75
## Max. :188.00
Convert the matrix state.x77 into a data frame, and make a summary of the data frame.
# Render the states data frame as a Markdown table.
knitr::kable(states)
| | Population | Murder | Illiteracy | Income | Frost |
|---|---|---|---|---|---|
| Alabama | 3615 | 15.1 | 2.1 | 3624 | 20 |
| Alaska | 365 | 11.3 | 1.5 | 6315 | 152 |
| Arizona | 2212 | 7.8 | 1.8 | 4530 | 15 |
| Arkansas | 2110 | 10.1 | 1.9 | 3378 | 65 |
| California | 21198 | 10.3 | 1.1 | 5114 | 20 |
| Colorado | 2541 | 6.8 | 0.7 | 4884 | 166 |
| Connecticut | 3100 | 3.1 | 1.1 | 5348 | 139 |
| Delaware | 579 | 6.2 | 0.9 | 4809 | 103 |
| Florida | 8277 | 10.7 | 1.3 | 4815 | 11 |
| Georgia | 4931 | 13.9 | 2.0 | 4091 | 60 |
| Hawaii | 868 | 6.2 | 1.9 | 4963 | 0 |
| Idaho | 813 | 5.3 | 0.6 | 4119 | 126 |
| Illinois | 11197 | 10.3 | 0.9 | 5107 | 127 |
| Indiana | 5313 | 7.1 | 0.7 | 4458 | 122 |
| Iowa | 2861 | 2.3 | 0.5 | 4628 | 140 |
| Kansas | 2280 | 4.5 | 0.6 | 4669 | 114 |
| Kentucky | 3387 | 10.6 | 1.6 | 3712 | 95 |
| Louisiana | 3806 | 13.2 | 2.8 | 3545 | 12 |
| Maine | 1058 | 2.7 | 0.7 | 3694 | 161 |
| Maryland | 4122 | 8.5 | 0.9 | 5299 | 101 |
| Massachusetts | 5814 | 3.3 | 1.1 | 4755 | 103 |
| Michigan | 9111 | 11.1 | 0.9 | 4751 | 125 |
| Minnesota | 3921 | 2.3 | 0.6 | 4675 | 160 |
| Mississippi | 2341 | 12.5 | 2.4 | 3098 | 50 |
| Missouri | 4767 | 9.3 | 0.8 | 4254 | 108 |
| Montana | 746 | 5.0 | 0.6 | 4347 | 155 |
| Nebraska | 1544 | 2.9 | 0.6 | 4508 | 139 |
| Nevada | 590 | 11.5 | 0.5 | 5149 | 188 |
| New Hampshire | 812 | 3.3 | 0.7 | 4281 | 174 |
| New Jersey | 7333 | 5.2 | 1.1 | 5237 | 115 |
| New Mexico | 1144 | 9.7 | 2.2 | 3601 | 120 |
| New York | 18076 | 10.9 | 1.4 | 4903 | 82 |
| North Carolina | 5441 | 11.1 | 1.8 | 3875 | 80 |
| North Dakota | 637 | 1.4 | 0.8 | 5087 | 186 |
| Ohio | 10735 | 7.4 | 0.8 | 4561 | 124 |
| Oklahoma | 2715 | 6.4 | 1.1 | 3983 | 82 |
| Oregon | 2284 | 4.2 | 0.6 | 4660 | 44 |
| Pennsylvania | 11860 | 6.1 | 1.0 | 4449 | 126 |
| Rhode Island | 931 | 2.4 | 1.3 | 4558 | 127 |
| South Carolina | 2816 | 11.6 | 2.3 | 3635 | 65 |
| South Dakota | 681 | 1.7 | 0.5 | 4167 | 172 |
| Tennessee | 4173 | 11.0 | 1.7 | 3821 | 70 |
| Texas | 12237 | 12.2 | 2.2 | 4188 | 35 |
| Utah | 1203 | 4.5 | 0.6 | 4022 | 137 |
| Vermont | 472 | 5.5 | 0.6 | 3907 | 168 |
| Virginia | 4981 | 9.5 | 1.4 | 4701 | 85 |
| Washington | 3559 | 4.3 | 0.6 | 4864 | 32 |
| West Virginia | 1799 | 6.7 | 1.4 | 3617 | 100 |
| Wisconsin | 4589 | 3.0 | 0.7 | 4468 | 149 |
| Wyoming | 376 | 6.9 | 0.6 | 4566 | 173 |
Generated a table of the data frame of states
# Simple linear regression of per-capita income on illiteracy rate;
# print the coefficient table (estimates, standard errors, t values, p-values).
lm.D1 <- lm(Income~Illiteracy,data = states)
summary(lm.D1)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4951.3198 172.2848 28.739158 6.684386e-32
## Illiteracy -440.6152 130.8723 -3.366757 1.505073e-03
The \(R^2\) is 0.1910347
# Standard 2x2 panel of lm diagnostic plots (residuals vs fitted, Q-Q,
# scale-location, residuals vs leverage). Margins shrunk so all four
# panels fit in the knitr figure region.
par(mar=c(1,1,1,1))
par(mfrow=c(2,2))
plot(lm.D1)
### example3:
# Pair each year 1700-1988 with the yearly sunspot counts (datasets::sunspot.year).
y <-1700:1988
sunspotData <- data.frame(y,sunspot.year)
summary(sunspotData)
## y sunspot.year
## Min. :1700 Min. : 0.00
## 1st Qu.:1772 1st Qu.: 15.60
## Median :1844 Median : 39.00
## Mean :1844 Mean : 48.61
## 3rd Qu.:1916 3rd Qu.: 68.90
## Max. :1988 Max. :190.20
Generated a data frame of sunspot data from 1700 to 1988.
# Show only the first 20 rows to keep the rendered table short.
knitr::kable(sunspotData[1:20,])
| y | sunspot.year |
|---|---|
| 1700 | 5 |
| 1701 | 11 |
| 1702 | 16 |
| 1703 | 23 |
| 1704 | 36 |
| 1705 | 58 |
| 1706 | 29 |
| 1707 | 20 |
| 1708 | 10 |
| 1709 | 8 |
| 1710 | 3 |
| 1711 | 0 |
| 1712 | 0 |
| 1713 | 2 |
| 1714 | 11 |
| 1715 | 27 |
| 1716 | 47 |
| 1717 | 63 |
| 1718 | 60 |
| 1719 | 39 |
Generated a table of the first twenty rows of the data frame of sunspotData
# Example (figure): yearly sunspot counts, 1700-1988, as a line-and-point plot.
# BUG FIX: the original called par(mar = c(1,1,1,1)) and then immediately
# overwrote it on the next line; the redundant first call has been removed.
par(mar = c(5, 4, 2, 0.1) + 0.1)
plot(y, sunspot.year, type = "b", lty = 1, pch = 21, bg = "red",
     main = "Yearly Sunspot Data, 1700-1988")
Generate a line-and-point plot of sunspotData; it can be seen that the number of sunspots has fluctuated between 0 and 200.
Exercises 3.4, 3.11, and 3.18 (pages 94-96, Statistical Computing with R).
The Rayleigh density is \[ f(x)=\frac{x}{\sigma^{2}} e^{-x^{2} /\left(2 \sigma^{2}\right)}, \quad x \geq 0, \sigma>0 \] Its cumulative distribution function is\[ F(x)=1-e^{-\frac{x^{2}}{2\sigma^2}}, \quad x\geq 0 \] According to the inverse transformation method,we can generate random samples from a Rayleigh(σ) distribution.In the first example,take σ=1,we can find that the mode of the generated samples is close to the theoretical mode σ(check the histogram).
# Exercise 3.4 -- Rayleigh(sigma) samples by the inverse transform, sigma = 1.
# If U ~ U(0,1) then X = sigma * sqrt(-2 * log(U)) has cdf
# F(x) = 1 - exp(-x^2 / (2 sigma^2)), i.e. X ~ Rayleigh(sigma).
n <- 1000
set.seed(1235)
u <- runif(n)
sigma <- 1
x <- sigma*sqrt(-2*log(u)) # F(x) = 1-exp(-x^2/(2*sigma^2))
# Histogram of the sample with the true density overlaid.
# BUG FIX: the plotmath title used e(-x^2/...) (function-call notation);
# the density's exponent form e^{...} is what was intended.
hist(x, prob = TRUE, breaks = seq(0, 4, .1),
     main = expression(f(x) == x/(sigma^2)*e^{-x^2/(2*sigma^2)}),
     col = "yellow"
)
y <- seq(0, 4, .01)
lines(y, y/(sigma^2)*exp(-y^2/(2*sigma^2)),col = "red")
Continue, we take σ=5 and σ=10 respectively,the empirical distribution of the sample is still close to the true distribution ### σ=5 ###
# Rayleigh samples with sigma = 5 (same inverse-transform construction).
n <- 1000
set.seed(1234)
u <- runif(n)
sigma <- 5
x <- sigma*sqrt(-2*log(u)) # F(x) = 1-exp(-x^2/(2*sigma^2))
# BUG FIX: plotmath e(...) replaced by the exponent form e^{...}.
hist(x, prob = TRUE, breaks = seq(0, 20, .4),
     main = expression(f(x) == x/(sigma^2)*e^{-x^2/(2*sigma^2)}),
     col = "yellow"
)
y <- seq(0, 20, .01)
lines(y, y/(sigma^2)*exp(-y^2/(2*sigma^2)),col = "red")
### σ=10 ###
# Rayleigh samples with sigma = 10 (same inverse-transform construction).
n <- 1000
set.seed(1235)
u <- runif(n)
sigma <- 10
x <- sigma*sqrt(-2*log(u)) # F(x) = 1-exp(-x^2/(2*sigma^2))
# BUG FIX: plotmath e(...) replaced by the exponent form e^{...}.
hist(x, prob = TRUE, breaks = seq(0,40, 1),
     main = expression(f(x) == x/(sigma^2)*e^{-x^2/(2*sigma^2)}),
     col = "yellow"
)
y <- seq(0,40, .01)
lines(y, y/(sigma^2)*exp(-y^2/(2*sigma^2)),col = "red")
## Exercise 3.11 Generate a random sample of size 1000 from a normal location mixture. The components of the mixture have N (0,1) and N (3,1) distributions with mixing probabilities p1 and p2 = 1 − p1. Graph the histogram of the sample with density superimposed, for p1 = 0.75.We can find the empirical distribution of the mixture appears to be bimodal
# Exercise 3.11 -- normal location mixture 0.75*N(0,1) + 0.25*N(3,1).
n <- 1000
p1 <- 0.75
p2 <- 1 - p1
set.seed(1234)
X1 <- rnorm(n,0,1)
X2 <- rnorm(n,3,1)
# r[i] = 1 selects the N(0,1) component (prob p1); r[i] = 0 selects N(3,1).
r <- sample(c(1,0),n,replace=TRUE,prob = c(p1,p2))
Z1 <- r*X1+(1-r)*X2
# BUG FIX: plotmath e(...) replaced by the exponent form e^{...}.
hist(Z1, prob = TRUE, breaks = seq(-4, 8, .2),
     main = expression(f(x) == (0.75/sqrt(2*pi))*e^{-x^2/2} +
                         (0.25/sqrt(2*pi))*e^{-(x-3)^2/2}),
     col = "blue"
)
y <- seq(-4, 8, .01)
lines(y, 0.75/sqrt(2*pi)*exp(-y^2/2)+
        0.25/sqrt(2*pi)*exp(-(y-3)^2/2),
      col = "red")
Continue, we take p1=0.1, 0.3, 0.5, 0.6, 0.8, 0.9 respectively to observe whether the empirical distribution of the mixture appears to be bimodal
# Exercise 3.11 (continued) -- repeat the mixture experiment for several p1.
# The six copy-pasted blocks of the original are factored into one helper.

# Draw n values from p1*N(0,1) + (1-p1)*N(3,1), plot a density histogram with
# the true mixture density overlaid, and invisibly return the sample.
#   p1       mixing probability of the N(0,1) component
#   seed     RNG seed (kept identical to the original runs)
#   brk_max  upper end of the histogram break sequence (lower end is -4)
#   brk_step step of the break sequence
#   n        sample size (default matches the original, 1000)
plot_mixture <- function(p1, seed, brk_max, brk_step, n = 1000) {
  p2 <- 1 - p1
  set.seed(seed)
  X1 <- rnorm(n, 0, 1)
  X2 <- rnorm(n, 3, 1)
  # r[i] = 1 selects the N(0,1) component (prob p1); r[i] = 0 selects N(3,1).
  r <- sample(c(1, 0), n, replace = TRUE, prob = c(p1, p2))
  Z1 <- r * X1 + (1 - r) * X2
  # bquote() substitutes the numeric weights into the plotmath title;
  # e^{...} fixes the e(...) call notation used by the original titles.
  hist(Z1, prob = TRUE, breaks = seq(-4, brk_max, brk_step),
       main = bquote(f(x) == (.(p1)/sqrt(2*pi))*e^{-x^2/2} +
                       (.(p2)/sqrt(2*pi))*e^{-(x-3)^2/2}),
       col = "blue")
  y <- seq(-4, brk_max, .01)
  lines(y, p1/sqrt(2*pi)*exp(-y^2/2) + p2/sqrt(2*pi)*exp(-(y-3)^2/2),
        col = "red")
  invisible(Z1)
}

# Same seeds and histogram break grids as the original six runs.
# (CONSISTENCY FIX: the p1 = 0.8 run drew its density curve on a coarse
# seq(-4, 6, .2) grid; all curves now use the .01 grid like the others.)
plot_mixture(0.1, seed = 8,    brk_max = 10, brk_step = .25)
plot_mixture(0.3, seed = 6,    brk_max = 10, brk_step = .3)
plot_mixture(0.5, seed = 12,   brk_max = 8,  brk_step = .3)
plot_mixture(0.6, seed = 1234, brk_max = 8,  brk_step = .3)
plot_mixture(0.8, seed = 19,   brk_max = 6,  brk_step = .15)
plot_mixture(0.9, seed = 5,    brk_max = 6,  brk_step = .25)
By observing the above experimental results, we can conjecture that the values of p1 that produce bimodal mixtures lie between 0.25 and 0.75. When p1 < 0.25 or p1 > 0.75, bimodal mixtures may still occur, but they are not obvious.
Write a function to generate a random sample from a \(W_{d}(\Sigma, n)\)(Wishart) distribution for n > d + 1 ≥ 1, based on Bartlett’s decomposition.We take n=6,d=4,and\[Σ= \left[\begin{array}{cccc}{1} & {0.7} & {0.3} & {0.9} \\ {0.7} & {1} & {0.2} & {0.6} \\ {0.3} & {0.2} & {1} & {0.5} \\ {0.9} & {0.6} & {0.5} & {1}\end{array}\right] \]
set.seed(1234)
# Exercise 3.18 -- Wishart W_d(Sigma, n) sample via Bartlett's decomposition.
# fun(d, n) builds the d x d lower-triangular Bartlett factor B:
#   B[i,i] = sqrt(chi^2_{n-i+1}),  B[i,j] ~ N(0,1) for i > j,  0 above the
# diagonal, so that B %*% t(B) ~ W_d(I, n).
# (IDIOM FIXES: the matrix was named T, which masks the TRUE alias; `=`
# assignments replaced by `<-`; 1:d loops replaced by seq_len. The RNG draw
# order is unchanged, so results are identical.)
fun <- function(d, n) {
  B <- matrix(0, nrow = d, ncol = d)
  for (j in seq_len(d)) {
    for (i in seq_len(d)) {
      if (i > j) {
        B[i, j] <- rnorm(1)
      } else if (i == j) {
        B[i, j] <- sqrt(rchisq(1, n - i + 1))
      }
      # i < j entries stay 0 (matrix() is already zero-filled)
    }
  }
  B
}
A <- fun(4, 6) # d = 4, n = 6: A %*% t(A) ~ W_4(I, 6)
sigma <- matrix(c(1, 0.7, 0.3, 0.9,
                  0.7, 1, 0.2, 0.6,
                  0.3, 0.2, 1, 0.5,
                  0.9, 0.6, 0.5, 1), nrow = 4)
# Cholesky factor L with sigma = L %*% t(L) (Bartlett's decomposition);
# then W = (L A)(L A)' ~ W_4(sigma, 6).
L <- t(chol(sigma))
W <- L %*% A %*% t(A) %*% t(L)
W
## [,1] [,2] [,3] [,4]
## [1,] 1.911427 1.648385 1.041144 1.456826
## [2,] 1.648385 3.513408 1.012706 1.918667
## [3,] 1.041144 1.012706 5.934319 1.661255
## [4,] 1.456826 1.918667 1.661255 1.553874
The matrix W is a random sample generated by the function written above.
5.1、Compute a Monte Carlo estimate of \[ \int_{0}^{\pi / 3} \sin t d t\] and compare your estimate with the exact value of the integral.
5.10、Use Monte Carlo integration with antithetic variables to estimate \[ \int_{0}^{1} \frac{e^{-x}}{1+x^{2}} d x, \] and find the approximate reduction in variance as a percentage of the variance without variance reduction.
5.15、Obtain the stratified importance sampling estimate in Example 5.13 and compare it with the result of Example 5.10.
# Exercise 5.1 -- Monte Carlo estimate of the integral of sin(t) over (0, pi/3).
m <- 1e4 ;
set.seed(1234)
t <- runif(m, min = 0, max = pi/3)
# For T ~ U(0, pi/3), (pi/3) * E[sin(T)] equals the integral (sample-mean method).
theta.hat <- mean(sin(t))*(pi/3)
# Compare with the exact value cos(0) - cos(pi/3) = 0.5.
print(c(theta.hat, cos(0)-cos(pi/3)))
## [1] 0.5005534 0.5000000
By running the code, we get the Monte Carlo estimate, We can see that it is very close to the exact value of the integral.
set.seed(1234)
# Exercise 5.10 -- MC estimate of integral_0^t exp(-x)/(1+x^2) dx.
#   t           upper limit of integration (used as a scalar below)
#   n           total number of uniforms per estimate
#   antithetic  if TRUE, the second half of the sample is the antithetic
#               counterpart t - U of the first half
# Both fixes below behave identically to the original at t = 1, the only
# value used in this document.
MC.f <- function(t, n = 10000, antithetic = TRUE) {
  p <- runif(n/2, 0, t)
  if (!antithetic) q <- runif(n/2, 0, t) # BUG FIX: was runif(n/2), i.e. U(0,1) regardless of t
  else q <- t - p                        # BUG FIX: the antithetic pair on (0,t) is t - U, not 1 - U
  p <- c(p, q)
  cdf <- numeric(length(t))
  for (i in 1:length(t)) {
    g <- t[i] * exp(-p )/(1+p^2)
    cdf[i] <- mean(g)
  }
  cdf
}
# Replicate both estimators m times at t = 1 to compare their variances.
m <- 1000
MC <- MCa <- numeric(m)
t <- 1
for (i in 1:m) {
  # BUG FIX: the original relied on partial argument matching (`anti = FALSE`);
  # the argument name is spelled out in full.
  MC[i] <- MC.f(t, n = 1000, antithetic = FALSE)
  #Use Monte Carlo integration without antithetic variables
  MCa[i] <- MC.f(t, n = 1000)
  #Use Monte Carlo integration with antithetic variables
}
mean(MCa)
## [1] 0.5248848
# Variance without / with antithetic variables, and the absolute reduction.
c(var(MC),var(MCa),var(MC)-var(MCa))
## [1] 6.102502e-05 2.322506e-06 5.870251e-05
By running the code, we get the Monte Carlo integration estimate with antithetic variables,and find the approximate reduction in variance as a percentage of the variance without variance reduction.
# Exercise 5.15 -- stratified importance sampling for
# integral_0^1 exp(-x)/(1+x^2) dx with importance density
# f(x) = exp(-x)/(1 - exp(-1)) split into 5 equal-probability strata.
M <- 10000; k <- 5
r <- M/k #replicates per stratum
N <- 50 #number of times to repeat the estimation
# IDIOM FIX: renamed from T/S (T masks the TRUE alias) to strat.mean/strat.sd.
strat.mean <- numeric(k)
strat.sd <- numeric(k)
P <- matrix(0, N, 1)
Q <- matrix(0, N, 1)
# Ratio g/f on a stratum sample; defined once (the original re-created this
# closure on every inner iteration — no RNG involved, results unchanged).
g <- function(x) ((exp(-x)/(1+x^2))/(exp(-x)/(1-exp(-1))))
set.seed(1234)
for (i in 1:N) {
  for (j in 1:k) {
    # Stratum j uses u ~ U(j-1, j); u/5 is then uniform on ((j-1)/5, j/5).
    u <- runif(r, min = j-1, max = j)
    # Inverse of F(x) = 5(1-exp(-x))/(1-exp(-1)) restricted to the stratum.
    x <- -log(1-(1-exp(-1))*u/5)
    strat.mean[j] <- mean(g(x)) # stratum estimate
    strat.sd[j] <- sd(g(x))     # stratum standard deviation
  }
  P[i,1] <- mean(strat.mean)
  Q[i,1] <- mean(strat.sd)
}
esm <- mean(P)
ess <- mean(Q)
print(c(esm,ess))
## [1] 0.52477952 0.01831754
By running the code, we obtain the stratified importance sampling estimate in Example 5.13,and compare it with the result of Example 5.10,we can find that they all have similar estimates, but the stratified importance sampling estimate has a lower standard deviation.
6.5、Suppose a 95% symmetric t-interval is applied to estimate a mean, but the sample data are non-normal. Then the probability that the confidence interval covers the mean is not necessarily equal to 0.95. Use a Monte Carlo experiment to estimate the coverage probability of the t-interval for random samples of χ2(2) data with sample size n = 20. Compare your t-interval results with the simulation results in Example 6.4. (The t-interval should be more robust to departures from normality than the interval for variance.)
6.6、Estimate the 0.025, 0.05, 0.95, and 0.975 quantiles of the skewness \(\sqrt{b_{1}}\) under normality by a Monte Carlo experiment. Compute the standard error of the estimates from (2.14) using the normal approximation for the density (with exact variance formula). Compare the estimated quantiles with the quantiles of the large sample approximation \(\sqrt{b_{1}}\) ≈ N (0, 6/n).
# Exercise 6.5 -- MC estimate of the coverage probability of the 95% symmetric
# t-interval for the mean, sampling chi^2(2) data (true mean = 2), n = 20.
n <- 20
alpha <- .05
m <- 10000
set.seed(123)
xbar <- numeric(m)
es <- numeric(m)
for (i in 1:m) {
  # BUG FIX: the original computed xbar from one chi^2(2) sample but the
  # half-width from sd() of an *independent* second sample; a t-interval
  # must use the mean and standard deviation of the same sample.
  x <- rchisq(n, df = 2)
  xbar[i] <- mean(x)
  es[i] <- qt((1-alpha/2),n-1)*sd(x)/sqrt(n)
}
# The symmetric t-interval for the mean is xbar +/- t_{1-alpha/2}(n-1) * s / sqrt(n).
# Coverage = proportion of replicates whose interval contains the true mean 2.
p <- mean((xbar-es)<2 & 2<(xbar+es))
p
## [1] 0.9241
By running the code, we can see that the t-interval is more robust to departures from normality than the interval for variance.
# Exercise 6.6 -- MC estimate of the 0.025/0.05/0.95/0.975 quantiles of the
# sample skewness sqrt(b1) under normality, compared with the large-sample
# approximation sqrt(b1) ~ N(0, 6/n).
# NOTE(review): the normal sd sqrt(6/n) used for x is irrelevant to skewness
# (skewness is scale-invariant); only the sample size m matters here.
n<-1000
v <- xbar <- m3 <- m2 <- numeric(n)
m <- 1000
set.seed(12345)
# n replicates: each computes the sample skewness of one normal sample of size m.
for (i in 1:n) {
x <- rnorm(m,mean = 0, sd = sqrt(6/n))
xbar[i] <- mean(x)
m3[i] <- mean((x - xbar[i])^3)
m2[i] <- mean((x - xbar[i])^2)
v[i] <- m3[i] / ((m2[i])^1.5)
}
# Sort the n skewness values; order statistics give the empirical quantiles.
u <- v[order(v)]
#Generate quantiles through empirical distribution
#Produce skewness coefficients from 1000 sets of samples from a normal distribution, and sort the 1000 values in ascending order to get the quantiles
q <- c(0.025,0.05,0.95,0.975)
gq <- lsq <- f <- ssd <- numeric(4)
#gq: MC estimates of the 0.025, 0.05, 0.95, 0.975 quantiles of sqrt(b1) under normality
#lsq: quantiles of the large-sample approximation sqrt(b1) ~ N(0, 6/n)
#ssd: standard error of the quantile estimates from (2.14), using the normal density approximation (with exact variance formula)
for (j in 1:4) {
gq[j] <- u[n*q[j]]
lsq[j] <- qnorm(q[j],mean = 0,sd = sqrt(6/n))
# Normal density of N(0, 6/n) evaluated at the estimated quantile.
f[j] <- (1/sqrt(2*pi*(6/n)))*exp(-gq[j]^2/(2*(6/n)))
# Asymptotic SE of a sample quantile: sqrt(q(1-q) / (n f(x_q)^2)).
ssd[j] <- sqrt(q[j]*(1-q[j])/(n*f[j]^2))
#ssd=sqrt(q*(1-q)/(n*f(xq)^2))
}
comparison <- data.frame(gq,lsq,ssd)
knitr::kable(comparison)
| gq | lsq | ssd |
|---|---|---|
| -0.1500233 | -0.1518182 | 0.0062545 |
| -0.1284425 | -0.1274098 | 0.0052915 |
| 0.1203948 | 0.1274098 | 0.0044782 |
| 0.1544270 | 0.1518182 | 0.0069938 |
By running the code, we can find that the estimated quantiles are very close to the quantiles of the large-sample approximation \(\sqrt{b_{1}}\) ≈ N(0, 6/n). The standard error of the estimates is quite small.
6.7 Estimate the power of the skewness test of normality against symmetric Beta(α, α) distributions and comment on the results. Are the results different for heavy-tailed symmetric alternatives such as t(ν)?
6.A Use Monte Carlo simulation to investigate whether the empirical Type I error rate of the t-test is approximately equal to the nominal significance level α, when the sampled population is non-normal. The t-test is robust to mild departures from normality. Discuss the simulation results for the cases where the sampled population is (i) \(\chi^{2}(1)\), (ii) Uniform(0,2), and (iii) Exponential(rate=1). In each case, test \(H_{0}: \mu=\mu_{0}\) vs \(H_{0}: \mu \neq \mu_{0}\), where \(\mu_{0}\) is the mean of \(\chi^{2}(1)\), Uniform(0,2), and Exponential(1), respectively.
**Discussion (homework)**
# Exercise 6.7 (small samples) -- empirical power of the skewness test of
# normality against symmetric Beta(20,20) alternatives (p.reject) and
# heavy-tailed t(2) alternatives (p.heavy), nominal level 0.05.
alpha <- 20
n <- c(10,20,30,50,100,500) #sample sizes
p.reject <- p.heavy <- cv <- xbar <-ybar <- m31 <- m32 <-
  m21 <- m22 <- u <- v <- numeric(length(n))
#to store sim. results
m <- 10000 #num. repl. each sim.
sktests <- heavy <- numeric(m) #test decisions
set.seed(12345)
for (i in 1:length(n)) {
  # PERF FIX: the critical value depends only on n[i]; the original recomputed
  # it on every one of the m inner iterations. qnorm() is deterministic, so
  # hoisting it changes neither the RNG stream nor any result.
  cv[i] <- qnorm(.975, 0,sqrt(6*(n[i]-2) / ((n[i]+1)*(n[i]+3))))
  for (j in 1:m) {
    x <- rbeta(n[i],alpha,alpha)
    y <- rt(n[i],2)
    xbar[i] <- mean(x)
    ybar[i] <- mean(y)
    m31[i] <- mean((x - xbar[i])^3)
    m32[i] <- mean((y - ybar[i])^3)
    m21[i] <- mean((x - xbar[i])^2)
    m22[i] <- mean((y - ybar[i])^2)
    u[i] <- m31[i] / ((m21[i])^1.5)   # sample skewness of the Beta sample
    v[i] <- m32[i] / ((m22[i])^1.5)   # sample skewness of the t(2) sample
    sktests[j] <- as.integer(abs(u[i])>= cv[i] )
    heavy[j] <- as.integer(abs(v[i])>= cv[i] )
  }
  p.reject[i] <- mean(sktests) #proportion rejected
  p.heavy[i] <- mean(heavy)
}
comparison <- data.frame(n,p.reject, p.heavy)
knitr::kable(comparison)
| n | p.reject | p.heavy |
|---|---|---|
| 10 | 0.0507 | 0.3309 |
| 20 | 0.0436 | 0.5021 |
| 30 | 0.0389 | 0.5988 |
| 50 | 0.0356 | 0.7130 |
| 100 | 0.0346 | 0.8128 |
| 500 | 0.0268 | 0.9405 |
This is the skewness test of normality in the case of small sample, as can be seen from the table above,the estimated value of power is very accurate when sample is small,and the estimates are getting worse as the sample size increases.
# Exercise 6.7 (large samples) -- same power comparison with the large-sample
# critical value cv = qnorm(.975, 0, sqrt(6/n)).
alpha <- 20
n <- c(1000,1500,2000) #sample sizes
p.reject <- p.heavy <- cv <- xbar <-ybar <- m31 <- m32 <-
  m21 <- m22 <- u <- v <- numeric(length(n))
#to store sim. results
m <- 10000 #num. repl. each sim.
sktests <- heavy <- numeric(m) #test decisions
set.seed(1234)
for (i in 1:length(n)) {
  # PERF FIX: loop-invariant critical value hoisted out of the inner loop
  # (qnorm is deterministic, so results and the RNG stream are unchanged).
  cv[i] <- qnorm(.975, 0, sqrt(6/n[i]))
  for (j in 1:m) {
    x <- rbeta(n[i],alpha,alpha)
    y <- rt(n[i],2)
    xbar[i] <- mean(x)
    ybar[i] <- mean(y)
    m31[i] <- mean((x - xbar[i])^3)
    m32[i] <- mean((y - ybar[i])^3)
    m21[i] <- mean((x - xbar[i])^2)
    m22[i] <- mean((y - ybar[i])^2)
    u[i] <- m31[i] / ((m21[i])^1.5)   # sample skewness of the Beta sample
    v[i] <- m32[i] / ((m22[i])^1.5)   # sample skewness of the t(2) sample
    sktests[j] <- as.integer(abs(u[i])>= cv[i] )
    heavy[j] <- as.integer(abs(v[i])>= cv[i] )
  }
  p.reject[i] <- mean(sktests) #proportion rejected
  p.heavy[i] <- mean(heavy)
}
comparison <- data.frame(n,p.reject, p.heavy)
knitr::kable(comparison)
| n | p.reject | p.heavy |
|---|---|---|
| 1000 | 0.0287 | 0.9660 |
| 1500 | 0.0289 | 0.9736 |
| 2000 | 0.0292 | 0.9781 |
This is the skewness test of normality in the case of large sample, as can be seen from the table above, we can find the estimates are not very accurate when sample size is very large.
# Power of the skewness test against an epsilon-contaminated Beta mixture:
# each observation is Beta(4,4) with prob 1-e or Beta(10,10) with prob e.
alpha1 <- 4
alpha2 <- 10
n <- 300
m <- 1500
set.seed(1234)
# Contamination proportions: fine grid near 0, coarse grid up to 1.
epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
N <- length(epsilon)
pwr <- numeric(N)
#critical value for the skewness test
cv <- qnorm(0.975, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
for (j in 1:N) { #for each epsilon
e <- epsilon[j]
sktests <- xbar <- m3 <- m2 <- sk <- numeric(m)
for (i in 1:m) { #for each replicate
# Per-observation shape parameter: alpha1 w.p. 1-e, alpha2 w.p. e.
alpha <- sample(c(alpha1, alpha2), replace = TRUE,
size = n, prob = c(1-e, e))
x <- rbeta(n, alpha, alpha)
# Sample skewness sqrt(b1) of the contaminated sample.
xbar[i] <- mean(x)
m3[i] <- mean((x-xbar[i])^3)
m2[i] <- mean((x-xbar[i])^2)
sk[i] <- m3[i] / ((m2[i])^1.5)
sktests[i] <- as.integer(abs(sk[i]) >= cv) }
pwr[j] <- mean(sktests)
}
#plot power vs epsilon
plot(epsilon, pwr, type = "b",
xlab = bquote(epsilon))
abline(h = .1, lty = 3)
se <- sqrt(pwr * (1-pwr) / m) #add standard errors
lines(epsilon, pwr+se, lty = 3,col = "red")
lines(epsilon, pwr-se, lty = 3,col = "red")
The estimates of the power of the skewness test of normality against symmetric Beta(α,α) distributions can be seen from the figure above.The entire image is increasing before 0.8, and basically a downward trend between 0.8 and 1.
# Same power study against an epsilon-contaminated t mixture:
# each observation is t(4) with prob 1-e or t(40) with prob e (heavy tails).
n1 <- 4
n2 <- 40
n <- 300
m <- 1500
set.seed(1234)
epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
N <- length(epsilon)
pwr <- numeric(N)
#critical value for the skewness test
cv <- qnorm(0.975, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
for (j in 1:N) { #for each epsilon
e <- epsilon[j]
sktests <- xbar <- m3 <- m2 <- sk <- numeric(m)
for (i in 1:m) { #for each replicate
# Per-observation degrees of freedom: n1 w.p. 1-e, n2 w.p. e.
nn <- sample(c(n1, n2), replace = TRUE,
size = n, prob = c(1-e, e))
x <- rt(n, nn)
# Sample skewness sqrt(b1) of the contaminated sample.
xbar[i] <- mean(x)
m3[i] <- mean((x-xbar[i])^3)
m2[i] <- mean((x-xbar[i])^2)
sk[i] <- m3[i] / ((m2[i])^1.5)
sktests[i] <- as.integer(abs(sk[i]) >= cv) }
pwr[j] <- mean(sktests)
}
#plot power vs epsilon
plot(epsilon, pwr, type = "b",
xlab = bquote(epsilon))
abline(h = .1, lty = 3)
se <- sqrt(pwr * (1-pwr) / m) #add standard errors
lines(epsilon, pwr+se, lty = 3,col = "red")
lines(epsilon, pwr-se, lty = 3,col = "red")
Similarly,we have obtained corresponding estimates in the case of t distribution, which can be seen from the above figure. By comparing the above two figures, we can find the results are different for heavy-tailed symmetric alternatives such as t(v) and the entire image is basically showing a downward trend.
# Exercise 6.A -- empirical Type I error rate of the one-sample t-test when
# the sampled population is non-normal: chi^2(1), Uniform(0,2), Exponential(1)
# (all with true mean 1), at nominal levels 0.01 / 0.05 / 0.10.
n <- 200
alpha <- c(0.01, 0.05, 0.1)
mu01 <- mu02 <- mu03 <- 1
#All of the means of the three distributions are 1
m <- 10000 #number of replicates
p1 <- p2 <- p3 <- matrix(0,3,m) #storage for p-values
p1.hat <- p2.hat <- p3.hat <- numeric(3)
#storage for the observed Type I error rate
se1.hat <- se2.hat <- se3.hat <- numeric(3)
#storage for the standard error of the estimate
set.seed(1234)
for (i in 1:3) {
  for (j in 1:m) {
    x <- rchisq(n,1)
    y <- runif(n,min = 0, max = 2)
    z <- rexp(n,1)
    # BUG FIX: the original passed alternative = "two.side", which only worked
    # via partial value matching inside t.test; the value is spelled out.
    ttest1 <- t.test(x, alternative = "two.sided", mu = mu01)
    ttest2 <- t.test(y, alternative = "two.sided", mu = mu02)
    ttest3 <- t.test(z, alternative = "two.sided", mu = mu03)
    p1[i,j] <- ttest1$p.value
    p2[i,j] <- ttest2$p.value
    p3[i,j] <- ttest3$p.value
  }
  # Observed rejection rate at nominal level alpha[i] and its binomial SE.
  p1.hat[i] <- mean(p1[i,] <= alpha[i])
  se1.hat[i] <- sqrt(p1.hat[i] * (1 - p1.hat[i]) / m)
  p2.hat[i] <- mean(p2[i,] <= alpha[i])
  se2.hat[i] <- sqrt(p2.hat[i] * (1 - p2.hat[i]) / m)
  p3.hat[i] <- mean(p3[i,] <= alpha[i])
  se3.hat[i] <- sqrt(p3.hat[i] * (1 - p3.hat[i]) / m)
}
# Assemble one row per (distribution, nominal level) combination.
Sampled.population <- c("chisq(1)", "Uniform(0,2)", "exp(1)",
                        "chisq(1)", "Uniform(0,2)", "exp(1)",
                        "chisq(1)", "Uniform(0,2)", "exp(1)")
Nominal.level <- c(alpha[1], alpha[1], alpha[1],
                   alpha[2], alpha[2], alpha[2],
                   alpha[3], alpha[3], alpha[3])
Observed.t1e <- c(p1.hat[1], p2.hat[1], p3.hat[1],
                  p1.hat[2], p2.hat[2], p3.hat[2],
                  p1.hat[3], p2.hat[3], p3.hat[3])
Estimate.se <- c(se1.hat[1], se2.hat[1], se3.hat[1],
                 se1.hat[2], se2.hat[2], se3.hat[2],
                 se1.hat[3], se2.hat[3], se3.hat[3])
result <- data.frame(Sampled.population, Nominal.level, Observed.t1e, Estimate.se)
knitr::kable(result)
| Sampled.population | Nominal.level | Observed.t1e | Estimate.se |
|---|---|---|---|
| chisq(1) | 0.01 | 0.0158 | 0.0012470 |
| Uniform(0,2) | 0.01 | 0.0116 | 0.0010708 |
| exp(1) | 0.01 | 0.0132 | 0.0011413 |
| chisq(1) | 0.05 | 0.0574 | 0.0023261 |
| Uniform(0,2) | 0.05 | 0.0505 | 0.0021897 |
| exp(1) | 0.05 | 0.0567 | 0.0023127 |
| chisq(1) | 0.10 | 0.1094 | 0.0031214 |
| Uniform(0,2) | 0.10 | 0.0968 | 0.0029569 |
| exp(1) | 0.10 | 0.0984 | 0.0029785 |
As can be seen from the table, we can find the empirical Type I error rate of the t-test is approximately equal to the nominal significance level α when the sampled population is non-normal (such as \(\chi^{2}(1)\), Uniform(0,2) and Exponential(rate=1)). The t-test is robust to mild departures from normality.
We can’t say the powers are different at 0.05 level.
7.6、Efron and Tibshirani discuss the scor (bootstrap) test score data on 88 students who took examinations in five subjects [84, Table 7.1], [188, Table 1.2.1].The first two tests (mechanics, vectors) were closed book and the last three tests (algebra, analysis, statistics) were open book. Each row of the data frame is a set of scores \(\left(x_{i 1}, \ldots, x_{i 5}\right) \text { for the } i^{t h}\) student. Use a panel display to display the scatter plots for each pair of test scores. Compare the plot with the sample correlation matrix. Obtain bootstrap estimates of the standard errors for each of the following estimates:\[ \begin{array}{l}{\hat{\rho}_{12}=\hat{\rho}(\mathrm{mec}, \mathrm{vec}), \hat{\rho}_{34}=\hat{\rho}(\mathrm{alg},} \ { \text { ana } ), \hat{\rho}_{35}=\hat{\rho}(\mathrm{alg}, \mathrm{sta}), \hat{\rho}_{45}=\hat{\rho}(\mathrm{ana}, \mathrm{sta})}\end{array} \]
7.B、Repeat Project 7.A for the sample skewness statist\(\chi^{2}(5)\) distributions (positive skewness).
# Exercise 7.6 -- scor data: pairwise scatter plots, the sample correlation
# matrix, and bootstrap estimates of bias and SE for four correlations.
library(bootstrap)
library(boot)
# (Dropped the unused `s` from the original assignment chain.)
original <- bias <- se <- numeric(4)
par(mar=c(1,1,1,1))
opar <- par(no.readonly = TRUE)
par(mfrow = c(3,4))
# One scatter plot per pair of the 5 exam scores (10 panels).
for (i in 1:4) {
  for (j in (i+1):5) {
    plot(scor[[i]],scor[[j]],"p",
         xlab = colnames(scor)[i], ylab = colnames(scor)[j], col = "blue")
  }
}
par(opar)
cor(scor) #generate the sample correlation matrix.
# Column index pairs: (mec,vec), (alg,ana), (alg,sta), (ana,sta).
p <- c(1,3,3,4)
q <- c(2,4,5,5)
for (k in 1:4) {
  b.cor <- function(scor,i) cor(scor[i,p[k]],scor[i,q[k]])
  obj <- boot(data = scor, statistic = b.cor, R = 2000)
  # BUG FIX: the original wrapped these assignments in round(c(...), 3) inside
  # the for loop; that value was silently discarded (for loops do not
  # auto-print), so the useless wrapper has been removed.
  original[k] <- obj$t0
  bias[k] <- mean(obj$t) - obj$t0
  se[k] <- sd(obj$t)
}
estimates <- c("mec~vec", "alg~ana", "alg~sta","ana~sta")
result <- data.frame(estimates,original, bias, se)
knitr::kable (result)
| estimates | original | bias | se |
|---|---|---|---|
| mec~vec | 0.5534052 | -0.0040153 | 0.0766079 |
| alg~ana | 0.7108059 | 0.0006191 | 0.0478017 |
| alg~sta | 0.6647357 | 0.0004038 | 0.0602799 |
| ana~sta | 0.6071743 | 0.0001693 | 0.0677358 |
Comparing the scatter plot with the sample correlation matrix, it can be seen that the correlation coefficient between each pair of variables is consistent with the correlation trend of the scatter plots they form.The bootstrap estimates of the standard errors for each of the estimates are relatively small.
# Sample skewness coefficient sqrt(b1) of x[i] (the index argument i makes
# the statistic usable with boot()): m3 / m2^(3/2) with central moments
# computed about the resampled mean.
skewness <- function(x, i) {
  xs <- x[i]
  centered <- xs - mean(xs)
  mean(centered^3) / mean(centered^2)^1.5
}
# Project 7.A / 7.B, normal case: MC study of three bootstrap CIs for the
# skewness of N(0,16) data (true skewness s = 0), n = 20, m = 1000 replicates.
s <- 0
n <- 20
m <- 1000
set.seed(1234)
library(boot)
# One row per replicate: (lower, upper) bounds of each interval type.
nor.norm <- nor.basic <- nor.perc <- matrix(0, m, 2)
for (i in 1:m) {
data.nor <- rnorm(n, 0, 4)
nor.ske <- boot(data.nor, statistic = skewness, R=1000)
nor <- boot.ci(nor.ske, type=c("norm","basic","perc"))
# boot.ci stores the bounds at different positions per type:
# $norm is (level, lower, upper); $basic/$percent are 5-element rows.
nor.norm[i,] <- nor$norm[2:3]
nor.basic[i,] <- nor$basic[4:5]
nor.perc[i,] <- nor$percent[4:5]
}
#Calculate the coverage probability of a normal distribution
norm1 <- mean(nor.norm[,1] <= s & nor.norm[,2] >= s)
basic1 <- mean(nor.basic[,1] <= s & nor.basic[,2] >= s)
perc1 <- mean(nor.perc[,1] <= s & nor.perc[,2] >= s)
#Calculate the probability that the interval misses on the left (lower bound above s)
norm1.left <- mean(nor.norm[,1] >= s )
basic1.left <- mean(nor.basic[,1] >= s )
perc1.left <- mean(nor.perc[,1] >=s )
#Calculate the probability that the interval misses on the right (upper bound below s)
norm1.right <- mean(nor.norm[,2] <= s )
basic1.right <- mean(nor.basic[,2] <= s )
perc1.right <- mean(nor.perc[,2] <= s)
# Same coverage study for samples from chi-square(5); the true skewness of
# chisq(k) is sqrt(8/k), here sqrt(8/5).
s<-sqrt(8/5)
n<-20
m<-1000
set.seed(12345)
library(boot)
# Rows hold (lower, upper) CI endpoints for each replicate.
chi.norm<-chi.basic<-chi.perc<-matrix(0, m, 2)
for (i in 1:m) {
data.chisq<-rchisq(n,5)
chisq.ske<-boot(data.chisq,statistic=skewness, R=1000)
chi<- boot.ci(chisq.ske,type=c("norm","basic","perc"))
chi.norm[i,]<-chi$norm[2:3];
chi.basic[i,]<-chi$basic[4:5];
chi.perc[i,]<-chi$percent[4:5];
}
#Calculate the coverage probability of the chi-square distribution
norm2 <- mean(chi.norm[,1] <= s & chi.norm[,2] >= s)
basic2 <- mean(chi.basic[,1] <= s & chi.basic[,2] >= s)
perc2 <- mean(chi.perc[,1] <= s & chi.perc[,2] >= s)
#Calculate the probability of the left side of the chi-square distribution
# (interval misses the true skewness on the left)
norm2.left <- mean(chi.norm[,1] >= s )
basic2.left <-mean(chi.basic[,1] >= s )
perc2.left <- mean(chi.perc[,1] >=s )
#Calculate the right side probability of the chi-square distribution
# (interval misses the true skewness on the right)
norm2.right <- mean(chi.norm[,2] <= s )
basic2.right <- mean(chi.basic[,2] <= s )
perc2.right <- mean(chi.perc[,2] <= s)
# Assemble the summary table. Two fixes relative to the original:
# (1) "chisq(5" was missing its closing parenthesis;
# (2) the Left/Right/P.coverage vectors are ordered norm1, norm2, basic1,
#     basic2, perc1, perc2, so Type must repeat each method twice.
#     The original c("basic","norm","perc") recycled to
#     basic,norm,perc,basic,norm,perc and mislabelled every row.
Distribution <- c("N(0,16)","chisq(5)")
Type <- rep(c("norm", "basic", "perc"), each = 2)
Left <- c(norm1.left, norm2.left, basic1.left,
basic2.left, perc1.left, perc2.left)
Right <- c(norm1.right, norm2.right, basic1.right,
basic2.right, perc1.right, perc2.right)
P.coverage <- c(norm1, norm2, basic1, basic2, perc1, perc2)
result <- data.frame(Distribution, Type, Left, Right, P.coverage)
knitr::kable(result)
| Distribution | Type | Left | Right | P.coverage |
|---|---|---|---|---|
| N(0,16) | basic | 0.049 | 0.053 | 0.898 |
| chisq(5 | norm | 0.021 | 0.238 | 0.741 |
| N(0,16) | perc | 0.052 | 0.058 | 0.890 |
| chisq(5 | basic | 0.032 | 0.251 | 0.717 |
| N(0,16) | norm | 0.012 | 0.013 | 0.975 |
| chisq(5 | perc | 0.000 | 0.280 | 0.720 |
After repeating Project 7.A for the sample skewness statistic, we find that with a sample size of 20 the coverage probabilities of the three types of confidence intervals (the standard normal bootstrap interval, the basic bootstrap interval, and the percentile interval) are close to 0.9 for the normal distribution (skewness 0) and close to 0.7 for the \(\chi^{2}(5)\) distribution (positive skewness). The table also reports the probabilities that the confidence intervals miss on the left and on the right.
7.8、 Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of \(\hat{\theta}\).
7.10、 In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Repeat the analysis replacing the Log-Log model with a cubic polynomial model. Which of the four models is selected by the cross validation procedure? Which model is selected according to maximum adjusted \(R^{2}\)?
library(bootstrap)
# theta = lambda_1 / sum(lambda): the proportion of variance explained by
# the first principal component of the `scor` data.
sigma <- cov(scor)
s <- eigen(sigma)
v <- s$values[order(-(s$values))]  # eigenvalues, largest first
theta <- v[1]/sum(v)
#measures the proportion of variance explained by the first principal component of the original sample.
# FIX: take the sample size from the data instead of hard-coding 88, so the
# jackknife stays correct if `scor` changes.
n <- nrow(scor)
theta.hat <- numeric(n)
for (i in 1:n) {
  # Leave-one-out replicate: drop row i and recompute theta.
  scor.jack <- scor[-i, ]
  sigma.hat <- cov(scor.jack)
  ss <- eigen(sigma.hat)
  vv <- ss$values[order(-(ss$values))]
  theta.hat[i] <- vv[1]/sum(vv)
}
bias.jack <- (n-1)*(mean(theta.hat)-theta)
#Obtain the jackknife estimates of bias of theta-hat
se.jack <- sqrt((n-1)*mean((theta.hat-theta)^2))
#Obtain the jackknife estimates of standard error of theta-hat
round(c(original = theta, bias.jack = bias.jack, se.jack = se.jack), 4)
## original bias.jack se.jack
## 0.6191 0.0011 0.0496
After running the code, we can obtain the jackknife estimates of bias and standard error of \(\hat{\theta}\), and we can find the bias is very small.
library(DAAG,quietly=TRUE)
##
## 载入程辑包:'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
# Leave-one-out (n-fold) cross validation over four competing models for
# the DAAG ironslag data.
# FIX: extract the columns explicitly instead of attach(ironslag), which
# pollutes the search path and was never detach()ed.
magnetic <- ironslag$magnetic
chemical <- ironslag$chemical
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- numeric(n)              # LOO prediction errors
yhat1 <- yhat2 <- yhat3 <- yhat4 <- numeric(n)  # LOO predictions
SSR1 <- SSR2 <- SSR3 <- SSR4 <- numeric(n)      # per-point regression SS
SST1 <- SST2 <- SST3 <- SST4 <- numeric(n)      # per-point total SS
ybar <- mean(magnetic)
# for n-fold cross validation
# fit models on leave-one-out samples
for (k in 1:n) {
  y <- magnetic[-k]
  x <- chemical[-k]
  J1 <- lm(y ~ x) #Linear model
  yhat1[k] <- J1$coef[1] + J1$coef[2] * chemical[k]
  e1[k] <- magnetic[k] - yhat1[k]
  SSR1[k] <- (yhat1[k]-ybar)^2
  SST1[k] <- (magnetic[k]-ybar)^2
  J2 <- lm(y ~ x + I(x^2)) #Quadratic model
  yhat2[k] <- J2$coef[1] + J2$coef[2] * chemical[k] +
    J2$coef[3] * chemical[k]^2
  e2[k] <- magnetic[k] - yhat2[k]
  SSR2[k] <- (yhat2[k]-ybar)^2
  SST2[k] <- (magnetic[k]-ybar)^2
  J3 <- lm(log(y) ~ x) #Exponential model (linear on the log scale)
  logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
  yhat3[k] <- exp(logyhat3)
  e3[k] <- magnetic[k] - yhat3[k]
  SSR3[k] <- (yhat3[k]-ybar)^2
  SST3[k] <- (magnetic[k]-ybar)^2
  J4 <- lm(y ~ x + I(x^2) + I(x^3)) #Cubic polynomial model.
  yhat4[k] <- J4$coef[1] + J4$coef[2] * chemical[k] +
    J4$coef[3] * chemical[k]^2 + J4$coef[4] * chemical[k]^3
  e4[k] <- magnetic[k] - yhat4[k]
  SSR4[k] <- (yhat4[k]-ybar)^2
  SST4[k] <- (magnetic[k]-ybar)^2
}
# Mean squared LOO prediction error of each of the four models.
c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
## [1] 19.55644 17.85248 18.44188 18.17756
After calculating the corresponding average squared prediction error of the 4 models, we can find that the quadratic model of the four models is selected by the cross validation procedure due to its lowest average squared prediction error.
# Adjusted-R^2-style comparison with n = 52 observations; the multiplier
# (n-1)/(n-p-1) uses p = 1, 2, 1, 3 predictors for the four models.
# NOTE(review): this computes (n-1)/(n-p-1) * SSR/SST, whereas the usual
# adjusted R^2 is 1 - (1 - R^2)(n-1)/(n-p-1) -- confirm the intended formula.
c(51/50*sum(SSR1)/sum(SST1), 51/49*sum(SSR2)/sum(SST2),
51/50*sum(SSR3)/sum(SST3), 51/48*sum(SSR4)/sum(SST4))
## [1] 0.5450654 0.5944676 0.4676507 0.5818290
# R^2=(n-1)/(n-p-1)*SSR/SST n=52
The quadratic model has the largest \(R^{2}\), so it is selected according to maximum adjusted \(R^{2}\).
8.3、 The Count 5 test for equal variances in Section 6.4 is based on the maximum number of extreme points. Example 6.15 shows that the Count 5 criterion is not applicable for unequal sample sizes. Implement a permutation test for equal variance based on the maximum number of extreme points that applies when sample sizes are not necessarily equal.
Q2:
Power comparison (distance correlation test versus ball covariance test)
n1 <- 100; n2 <- 150
set.seed(12345)
m <- 500
# Count-five-type statistic: counts how many points of each centered sample
# fall outside the range of the other; rejects H0 (equal variances) when
# the larger count exceeds the threshold s.
count5test <- function(x, y, s) {
  xc <- x - mean(x)
  yc <- y - mean(y)
  extreme_x <- sum(xc > max(yc)) + sum(xc < min(yc))
  extreme_y <- sum(yc > max(xc)) + sum(yc < min(xc))
  # 1 = reject H0, 0 = do not reject H0
  as.integer(max(extreme_x, extreme_y) > s)
}
# Permutation version of the count-five test for unequal sample sizes
# (n1 = 100 vs n2 = 150), evaluated over a range of thresholds s.
x <- rnorm(n1)
y <- rnorm(n2)
s <- 5:15
R <- 100
q <- numeric(R)
alphahat <- pwr <- numeric(length(s))
for (j in 1:length(s)) {
ss <- s[j]
# Indicator (0/1) for the observed, unpermuted samples at threshold ss.
alphahat[j] <- count5test(x, y, ss)
z <- c(x, y)
K <- 1:(n1+n2); n<-length(x)
for (i in 1:R) {
# Permutation replicate: split the pooled sample into groups of size n1, n2.
k <- sample(K, size = n, replace = FALSE)
x1 <- z[k]; y1 <- z[-k] #complement of x1
x1 <- x1 - mean(x1)
#centered by sample mean
y1 <- y1 - mean(y1)
q[i] <- count5test(x1, y1, ss)
}
# Proportion of rejections over the observed statistic plus R permutations.
# NOTE(review): this pools one observed indicator with the permutation
# replicates (a permutation-p-value-style average), then plots it as
# "power" -- confirm the intended interpretation.
pwr[j] <- mean(c(alphahat[j], q))
}
plot(s, pwr, col = "red")
For a fixed sample, the estimated rejection probability decreases as the extreme-point threshold s increases.
library(boot)
library(MASS)
##
## 载入程辑包:'MASS'
## The following object is masked from 'package:DAAG':
##
## hills
library(Ball)
# Sample distance covariance between two samples (rows of x and y).
dCov <- function(x, y) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  n <- nrow(x)
  m <- nrow(y)
  if (n != m || n < 2) stop("Sample sizes must agree")
  if (!all(is.finite(c(x, y)))) {
    stop("Data contains missing or infinite values")
  }
  # Double-center the pairwise distance matrix:
  # A_ij = d_ij - rowmean_i - colmean_j + grand mean.
  center_dist <- function(z) {
    d <- as.matrix(dist(z))
    rm <- rowMeans(d)
    gm <- mean(d)
    sweep(sweep(d, 1, rm), 2, rm) + gm
  }
  A <- center_dist(x)
  B <- center_dist(y)
  sqrt(mean(A * B))
}
# Permutation-test statistic n * dCov(x, y)^2 for use with boot(sim =
# "permutation"): the first dims[1] columns of z are x (kept fixed) and the
# remaining columns are y (rows permuted via the index vector ix).
# The unused locals `q <- dims[2]` and `d <- p + q` from the original have
# been removed; dims[2] is still accepted for interface compatibility.
ndCov2 <- function(z, ix, dims) {
  #dims contains dimensions of x and y
  p <- dims[1]
  x <- z[ , 1:p]       #leave x as is
  y <- z[ix, -(1:p)]   #permute rows of y
  return(nrow(z) * dCov(x, y)^2)
}
# Power comparison (Ball covariance test vs distance correlation test) for
# Model 1: y = x/4 + e, over increasing sample sizes.
set.seed(123)
ss <- seq(10, 200, 10)   # sample sizes examined
N <- 50                  # Monte Carlo replicates per sample size
P <- matrix(0, nrow = length(ss), ncol = 2)
for(j in 1:N) {
p <- matrix(0, nrow = length(ss), ncol = 2)
for(i in 1:length(ss)) {
# data generate
n <- ss[i]
x <- mvrnorm(n, c(0,0), matrix(c(1,0,0,1), nrow = 2))
e <- mvrnorm(n, c(0,0), matrix(c(1,0,0,1), nrow = 2))
# Model 1: additive (linear) dependence
y <- x / 4 + e
z <- cbind(x, y)
# Ball test (Ball package); p-value from 50 permutations
p[i, 1] <- bcov.test(z[ ,1:2], z[ ,3:4], num.permutations=50, seed=i*j)$p.value
# permutation: resampling without replacement
boot.obj <- boot(data = z, statistic = ndCov2, R = 100,
sim = "permutation", dims = c(2, 2))
# Permutation p-value including the observed statistic tb[1] itself.
tb <- c(boot.obj$t0, boot.obj$t)
p[i, 2] <- mean(tb >= tb[1])
}
# Accumulate rejection indicators at level 0.05.
P <- P + (p <= 0.05)
}
P <- P / N
plot(ss, P[, 1], "b", col = "green", ylim = c(0, 1), ylab = "power1", main = "Model1: Power")
lines(ss, P[, 2], "b", col = "red")
legend("topleft", legend = c("Ball", "Cor"), lty = 1, col = c("green", "red"))
# Same comparison for Model 2: multiplicative dependence y = (x/4) * e.
P <- matrix(0, nrow = length(ss), ncol = 2)
for(j in 1:N) {
p <- matrix(0, length(ss), 2)
for(i in 1:length(ss)) {
# data generate
n <- ss[i]
x <- mvrnorm(n, c(0,0), matrix(c(1,0,0,1), nrow = 2))
e <- mvrnorm(n, c(0,0), matrix(c(1,0,0,1), nrow = 2))
# Model 2: multiplicative (nonlinear) dependence
y <- x / 4 * e
z <- cbind(x, y)
# Ball test
p[i, 1] <- bcov.test(z[, 1:2], z[, 3:4],
num.permutations = 50, seed=i*j)$p.value
# permutation: resampling without replacement
boot.obj <- boot(data = z, statistic = ndCov2, R = 100,
sim = "permutation", dims = c(2, 2))
# Permutation p-value including the observed statistic tb[1] itself.
tb <- c(boot.obj$t0, boot.obj$t)
p[i, 2] <- mean(tb >= tb[1])
}
P <- P + (p <= 0.05)
}
P <- P / N
plot(ss, P[, 1], "b", col = "green", ylim = c(0, 1), ylab = "power2", main = "Model2: Power")
lines(ss, P[, 2], "b", col = "red")
legend("bottomright", legend = c("Ball", "Cor"), lty = 1, col = c("green", "red"))
The power value of distance correlation test and ball covariance test will increase as the sample size increases, and the power value of the former is higher than the power value of the latter in model 1, and exactly the opposite in model 2.
9.4 Implement a random walk Metropolis sampler for generating the standard Laplace distribution (see Exercise 3.2). For the increment, simulate from a normal distribution. Compare the chains generated when different variances are used for the proposal distribution. Also, compute the acceptance rates of each chain.
set.seed(12345)
# Random-walk Metropolis sampler for the standard Laplace target
# f(x) = 0.5 * exp(-|x|), using N(current, sigma^2) proposal increments.
# Returns the chain `x` and the number of rejected proposals `k`.
rw.Metropolis <- function(sigma, x0, N) {
  chain <- numeric(N)
  chain[1] <- x0
  unif <- runif(N)   # acceptance uniforms drawn up front (same RNG order)
  rejected <- 0
  for (i in 2:N) {
    proposal <- rnorm(1, chain[i-1], sigma)
    # Acceptance ratio f(y)/f(x): densities of the standard Laplace law.
    ratio <- (0.5*exp(-abs(proposal))) / (0.5*exp(-abs(chain[i-1])))
    if (unif[i] <= ratio) {
      chain[i] <- proposal
    } else {
      chain[i] <- chain[i-1]
      rejected <- rejected + 1
    }
  }
  list(x=chain, k=rejected)
}
# Run four chains of length N from x0 = 10 with increasing proposal
# standard deviations, then tabulate rejection counts and acceptance rates.
N <- 4000
sigma <- c(.05, .5, 5, 20)
x0 <- 10
rw1 <- rw.Metropolis(sigma[1], x0, N)
rw2 <- rw.Metropolis(sigma[2], x0, N)
rw3 <- rw.Metropolis(sigma[3], x0, N)
rw4 <- rw.Metropolis(sigma[4], x0, N)
#number of candidate points rejected
# NOTE(review): the loop makes only N-1 proposals (it starts at i = 2), so
# dividing k by N slightly understates the rejection fraction.
accept.rate <- c(1-(rw1$k)/N, 1-(rw2$k)/N, 1-(rw3$k)/N, 1-(rw4$k)/N)
accept <- data.frame(sigma = sigma, no.reject=c(rw1$k, rw2$k, rw3$k, rw4$k), accept.rate = accept.rate)
knitr::kable(accept)
| sigma | no.reject | accept.rate |
|---|---|---|
| 0.05 | 74 | 0.98150 |
| 0.50 | 698 | 0.82550 |
| 5.00 | 2875 | 0.28125 |
| 20.00 | 3694 | 0.07650 |
The rate of acceptance decreases as the standard deviation of the Proposal distribution increases.
# Trace plots of the four chains, one panel per proposal standard
# deviation, with reference lines at the 5% and 95% Laplace quantiles.
par(mar = c(1, 1, 1, 1))
par(mfrow = c(2, 2))  # display 4 graphs together
refline <- c(log(0.05), -log(0.05))
rw <- cbind(rw1$x, rw2$x, rw3$x, rw4$x)
for (chain_id in seq_len(ncol(rw))) {
  plot(rw[, chain_id], type = "l",
       xlab = bquote(sigma == .(round(sigma[chain_id], 3))),
       ylab = "X", ylim = range(rw[, chain_id]))
  abline(h = refline)
}
par(mar=c(1,1,1,1))
par(mfrow=c(1,1)) #reset to default
a <- c(.05, seq(.1, .9, .1), .95)
# Theoretical quantiles of the standard Laplace distribution:
# F^{-1}(p) = log(2p) for p <= 1/2 and -log(2 - 2p) otherwise
# (vectorized; replaces the original index loop over 1:11).
Q <- ifelse(a <= 0.5, log(2*a), -log(2 - 2*a))
rw <- cbind(rw1$x, rw2$x, rw3$x, rw4$x)
mc <- rw[501:N, ]  # discard the first 500 values as burn-in
Qrw <- apply(mc, 2, function(x) quantile(x, a))
qq <- data.frame(round(cbind(Q, Qrw), 3))
# FIX: the chains were run with sigma = 0.05, 0.5, 5, 20; the original
# column labels wrongly said sigma=2 and sigma=16 for the last two chains.
names(qq) <- c('True','sigma=0.05','sigma=0.5','sigma=5','sigma=20')
knitr::kable(qq)
| True | sigma=0.05 | sigma=0.5 | sigma=2 | sigma=16 | |
|---|---|---|---|---|---|
| 5% | -2.303 | 2.633 | -1.743 | -2.403 | -2.017 |
| 10% | -1.609 | 3.439 | -1.337 | -1.601 | -1.362 |
| 20% | -0.916 | 4.450 | -0.815 | -0.973 | -0.660 |
| 30% | -0.511 | 5.741 | -0.425 | -0.547 | -0.368 |
| 40% | -0.223 | 6.082 | -0.165 | -0.236 | -0.224 |
| 50% | 0.000 | 6.278 | 0.019 | -0.017 | 0.092 |
| 60% | 0.223 | 6.418 | 0.238 | 0.186 | 0.255 |
| 70% | 0.511 | 6.569 | 0.527 | 0.381 | 0.441 |
| 80% | 0.916 | 6.938 | 0.944 | 0.772 | 0.741 |
| 90% | 1.609 | 8.169 | 1.609 | 1.407 | 0.996 |
| 95% | 2.303 | 8.391 | 2.079 | 2.086 | 1.609 |
Comparing the figure and the table, it can be seen that when the standard deviation is taken as 0.5 or 5 the overall effect is better; moreover, the latter is relatively better than the former.

## Question

11.1、 The natural logarithm and exponential functions are inverses of each other, so that mathematically log(exp x) = exp(log x) = x. Show by example that this property does not hold exactly in computer arithmetic. Does the identity hold with near equality? (See all.equal.)

11.5、 Write a function to solve the equation \[ \frac{2 \Gamma\left(\frac{k}{2}\right)}{\sqrt{\pi(k-1)} \Gamma\left(\frac{k-1}{2}\right)} \int_{0}^{c_{k-1}}\left(1+\frac{u^{2}}{k-1}\right)^{-k / 2} d u = \frac{2 \Gamma\left(\frac{k+1}{2}\right)}{\sqrt{\pi k} \Gamma\left(\frac{k}{2}\right)} \int_{0}^{c_{k}}\left(1+\frac{u^{2}}{k}\right)^{-(k+1) / 2} d u \] for a, where \[ c_{k}=\sqrt{\frac{a^{2} k}{k+1-a^{2}}} . \] Compare the solutions with the points A(k) in Exercise 11.4.

* A-B-O blood type problem: let the three alleles be A, B and O with allele frequencies p, q and r. The six genotype frequencies under HWE and the complete counts are:

Genotype: AA, BB, OO, AO, BO, AB (sum 1); Frequency: p^2, q^2, r^2, 2pr, 2qr, 2pq; Count: nAA, nBB, nOO, nAO, nBO, nAB (sum n).

Observed data: nA· = nAA + nAO = 28 (A-type), nB· = nBB + nBO = 24 (B-type), nOO = 41 (O-type), nAB = 70 (AB-type).

1. Use the EM algorithm to solve the MLE of p and q (treating nAA and nBB as missing data).
2. Show that the log-likelihood values in the M-steps are increasing via a line plot.

## Exercise 11.1
# Exercise 11.1: check whether log(exp(x)) == exp(log(x)) holds exactly in
# floating-point arithmetic for x = 1, ..., 22.
x <- seq(1, 22, 1)
log(exp(x)) == exp(log(x))
## [1] TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [12] TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
In these examples, this property does not always hold exactly in computer arithmetic.
# With a numerical tolerance (all.equal instead of ==) the identity holds.
isTRUE(all.equal(log(exp(x)),exp(log(x))))
## [1] TRUE
The identity hold with near equality
m <- c(5:25, 100, 500, 1000)  # k values for Exercise 11.4
n <- c(4:25, 100)             # k values for Exercise 11.5
# Normalizing-constant ratio 2*Gamma(x/2) / (sqrt(pi*(x-1))*Gamma((x-1)/2)),
# computed on the log scale for numerical stability. It does not depend on
# the loop variable, so it is defined once here instead of being redefined
# inside every iteration as in the original.
f <- function(x){
  return(exp(log(2) + lgamma(x/2) - (1/2)*(log(pi*(x-1))) - lgamma((x-1)/2)))
}
#Exercise 11.4: root of the difference of the two tail integrals.
A_root <- function(k) {
  g <- function(a){
    return((f(k)*integrate(function(x)(1 + (x^2)/(k-1))^(-k/2),
      lower = sqrt((a^2)*(k-1)/(k-a^2)), upper = Inf)$value) -
      (f(k+1)*integrate(function(x)(1 + (x^2)/k)^(-(k+1)/2),
      lower = sqrt((a^2)*k/(k+1-a^2)), upper = Inf)$value))
  }
  uniroot(g, lower = 0.01, upper = 1+sqrt(k)/2)$root
}
#Exercise 11.5: same equation written with the central integrals from 0.
B_root <- function(k) {
  h <- function(a){
    return((f(k)*integrate(function(x)(1 + (x^2)/(k-1))^(-k/2),
      lower = 0, upper = sqrt((a^2)*(k-1)/(k-a^2)))$value) -
      (f(k+1)*integrate(function(x)(1 + (x^2)/k)^(-(k+1)/2),
      lower = 0, upper = sqrt((a^2)*k/(k+1-a^2)))$value))
  }
  uniroot(h, lower = 0.01, upper = 1+sqrt(k)/2)$root
}
# Results are computed directly per k (the original grew vectors by
# out-of-range indexed assignment up to index 1000). NA padding matches the
# original table layout: A is not computed for k = 4, B not for k = 500, 1000.
A <- c(NA, vapply(m, A_root, numeric(1)))
B <- c(vapply(n, B_root, numeric(1)), NA, NA)
k <- c(4:25, 100, 500, 1000)
result <- data.frame(k, A, B)
# Render the comparison table (columns: k, A from Ex. 11.4, B from Ex. 11.5).
knitr::kable(result)
| k | A | B |
|---|---|---|
| 4 | NA | 1.492103 |
| 5 | 1.533576 | 1.533576 |
| 6 | 1.562721 | 1.562721 |
| 7 | 1.584427 | 1.584427 |
| 8 | 1.601166 | 1.601166 |
| 9 | 1.614524 | 1.614524 |
| 10 | 1.625385 | 1.625385 |
| 11 | 1.634415 | 1.634415 |
| 12 | 1.642027 | 1.642027 |
| 13 | 1.648536 | 1.648536 |
| 14 | 1.654175 | 1.654175 |
| 15 | 1.659097 | 1.659097 |
| 16 | 1.663408 | 1.663408 |
| 17 | 1.667301 | 1.667301 |
| 18 | 1.670724 | 1.670724 |
| 19 | 1.673819 | 1.673819 |
| 20 | 1.676589 | 1.676589 |
| 21 | 1.679154 | 1.679154 |
| 22 | 1.681470 | 1.681470 |
| 23 | 1.683610 | 1.683610 |
| 24 | 1.685548 | 1.685548 |
| 25 | 1.687347 | 1.687347 |
| 100 | 1.720610 | 1.720610 |
| 500 | 1.729738 | NA |
| 1000 | 1.730924 | NA |
Comparing the solutions B(k) from Exercise 11.5 with the points A(k) from Exercise 11.4, we find that the two solutions agree for every common value of k.
# EM algorithm for the A-B-O allele frequencies with observed phenotype
# counts (nAA and nBB are treated as missing data).
nA. <- 28; nB. <- 24; nOO <- 41; nAB <- 70
p <- q <- r <- numeric(100)
p[1] <- 0.2; q[1] <- 0.2; r[1] <- (1- p[1]- q[1])
#Given initial value of iteration
# f and g are ratios derived from the expected complete-data log-likelihood;
# together they give the closed-form M-step updates for p and q below.
f <- function(a,b) {
  return((nB.*b/(2-b-2*a)+nB.+nAB)/(nA.*a/(2-a-2*b)+nA.+nAB))
}
g <- function(a,b) {
  return(((1-a/(2-a-2*b))*nA.+(1-b/(2-b-2*a))*nB.+2*nOO)/((nB.*b/(2-b-2*a)+
  nB.+nAB)))
}
threshold <- 1e-5
#Given the threshold
for (k in 2:100) {
  p[k] <- 1/(1+f(p[k-1],q[k-1])*(1+g(p[k-1],q[k-1])))
  q[k] <- f(p[k-1],q[k-1])/(1+f(p[k-1],q[k-1])*(1+g(p[k-1],q[k-1])))
  r[k] <- 1- p[k] - q[k]
  # M-step update expressed in terms of the previous iterate.
  # FIX: convergence must be judged on the magnitude of the change. The
  # original compared signed differences (p[k]-p[k-1] <= threshold), which
  # would declare convergence after any decrease, however large.
  if (abs(p[k]-p[k-1]) <= threshold && abs(q[k]-q[k-1]) <= threshold &&
      abs(r[k]-r[k-1]) <= threshold) {
    # Report the iteration count and the converged MLEs of p, q, r.
    print(c(k, p[k], q[k], r[k]))
    break
  }
}
## [1] 7.0000000 0.3273433 0.3104259 0.3622308
Through the steps of the EM algorithm, we get the number of iteration and the MLE of p, q and r(consider missing data nAA and nBB).
# Trace the EM iterates of p (red), q (blue) and r (green) over the k
# iterations performed above.
# NOTE(review): the plotted series are the parameter iterates themselves,
# not log-likelihood values, although EM guarantees the observed-data
# log-likelihood is non-decreasing; the title is kept as originally written.
x <- seq(1,k,1)
plot(x, p[1:k], "b", col = "red",ylim=c(0,0.6), main = "The log-maximum likelihood values in M-steps" , xlab = "The number of iteration", ylab = "The value of iteration")
lines(x, q[1:k], "b", col = "blue")
lines(x, r[1:k], "b", col = "green")
legend("topright", legend = c("p", "q", "r"),lty = 1, col = c("red", "blue", "green"))
The line plot shows that the estimates of p and q increase monotonically across the EM iterations; correspondingly, the EM algorithm guarantees that the log-likelihood is non-decreasing at each M-step.
11.1.2 Exercises
Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg~disp+wt,
mpg~I(1/disp)+wt
)
Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function?
bootstraps <- lapply(1:10, function(i) {
rows <- sample(1:nrow(mtcars), rep = TRUE)
mtcars[rows, ]
})
For each model in the previous two exercises, extract R2 using the function below.
rsq <- function(mod) summary(mod)$r.squared

11.2.5 Exercises
The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.
trials <- replicate(
100,
t.test(rpois(10, 10), rpois(7, 10)),
simplify = FALSE
)

Extra challenge: get rid of the anonymous function by using [[ directly.
7.Implement mcsapply(), a multicore version of sapply(). Can you implement mcvapply(), a parallel version of vapply()? Why or why not?
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
# Fit each formula to mtcars with a for loop.
# FIX: store model objects in a list container; the original assigned list
# elements into a numeric vector (`v <- numeric(4); v[i] <- lapply(...)`),
# silently coercing it to a list and wrapping each fit in a pointless
# single-element lapply().
v <- vector("list", length(formulas))
for (i in seq_along(formulas)) {
  v[[i]] <- lm(formulas[[i]], data = mtcars)
}
v
## [[1]]
##
## Call:
## lm(formula = formulas[[i]], data = mtcars)
##
## Coefficients:
## (Intercept) disp
## 29.59985 -0.04122
##
##
## [[2]]
##
## Call:
## lm(formula = formulas[[i]], data = mtcars)
##
## Coefficients:
## (Intercept) I(1/disp)
## 10.75 1557.67
##
##
## [[3]]
##
## Call:
## lm(formula = formulas[[i]], data = mtcars)
##
## Coefficients:
## (Intercept) disp wt
## 34.96055 -0.01772 -3.35083
##
##
## [[4]]
##
## Call:
## lm(formula = formulas[[i]], data = mtcars)
##
## Coefficients:
## (Intercept) I(1/disp) wt
## 19.024 1142.560 -1.798
# Ten bootstrap replicates of mtcars: resample rows with replacement.
# FIX: spell out `replace = TRUE` (the original wrote `rep = TRUE`, relying
# on partial argument matching).
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(seq_len(nrow(mtcars)), replace = TRUE)
  mtcars[rows, ]
})
# Fit mpg ~ disp to each replicate with a for loop, storing the model
# objects in a list (the original coerced a numeric vector into a list via
# `u[i] <- lapply(...)`).
u <- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)) {
  u[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]])
}
u
## [[1]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 30.29250 -0.04913
##
##
## [[2]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 32.42525 -0.04808
##
##
## [[3]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 27.92085 -0.03738
##
##
## [[4]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 30.84337 -0.04615
##
##
## [[5]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 28.51253 -0.03682
##
##
## [[6]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 28.69627 -0.03812
##
##
## [[7]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 29.73172 -0.04127
##
##
## [[8]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 29.03038 -0.04165
##
##
## [[9]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 29.9939 -0.0418
##
##
## [[10]]
##
## Call:
## FUN(formula = ..1, data = X[[i]])
##
## Coefficients:
## (Intercept) disp
## 28.62946 -0.04011
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
# R^2 of a fitted model.
rsq <- function(mod) summary(mod)$r.squared
# Extract R^2 for each formula fitted to mtcars.
# FIX: use a list container and direct assignment; the original assigned
# lapply() results into a numeric vector, coercing it to a list.
v <- vector("list", length(formulas))
for (i in seq_along(formulas)) {
  v[[i]] <- rsq(lm(formulas[[i]], data = mtcars))
}
v
## [[1]]
## [1] 0.7183433
##
## [[2]]
## [1] 0.8596865
##
## [[3]]
## [1] 0.7809306
##
## [[4]]
## [1] 0.8838038
# R^2 of mpg ~ disp over ten bootstrap replicates of mtcars.
# FIX: `replace = TRUE` spelled out (was partial-matched `rep = TRUE`), and
# results go into a list instead of being forced into a numeric vector.
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(seq_len(nrow(mtcars)), replace = TRUE)
  mtcars[rows, ]
})
rsq <- function(mod) summary(mod)$r.squared
u <- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)) {
  u[[i]] <- rsq(lm(mpg ~ disp, data = bootstraps[[i]]))
}
u
## [[1]]
## [1] 0.5934501
##
## [[2]]
## [1] 0.6809705
##
## [[3]]
## [1] 0.7051608
##
## [[4]]
## [1] 0.774844
##
## [[5]]
## [1] 0.7335668
##
## [[6]]
## [1] 0.6659116
##
## [[7]]
## [1] 0.6834858
##
## [[8]]
## [1] 0.7760479
##
## [[9]]
## [1] 0.8187859
##
## [[10]]
## [1] 0.819337
# 100 t-tests on Poisson (non-normal) samples, kept as a list of htest
# objects.
trials <- replicate(
  100,
  t.test(rpois(10, 10), rpois(7, 10)),
  simplify = FALSE
)
# Extract the p-value from every trial. vapply is type-stable and replaces
# the original element-by-element loop that called sapply once per index.
u <- vapply(trials, function(trial) trial$p.value, numeric(1))
u
## [1] 0.864070020 0.105039587 0.190862925 0.056823493 0.406571240
## [6] 0.040538612 0.511503629 0.132517699 0.178841071 0.217510693
## [11] 0.761834379 0.858064206 0.057840963 0.355723094 0.653715063
## [16] 0.814481575 0.724040362 0.421870128 0.827385142 0.837643001
## [21] 0.473042063 0.621388894 0.959622063 0.307964515 0.304054985
## [26] 0.321495808 0.579810263 0.696476954 0.409212929 0.601596926
## [31] 0.096394818 0.190862925 0.455087758 0.608012345 0.908282897
## [36] 0.229295198 0.415917133 0.857016511 0.294107747 0.686243306
## [41] 0.678054866 0.707680859 0.840619960 0.063519839 0.550409294
## [46] 0.122005290 0.799307503 0.537074500 0.569415696 0.256213232
## [51] 0.348723849 0.089359850 0.811364014 0.076449530 0.802230991
## [56] 0.551710147 0.819112954 0.589525889 0.050932861 0.504847216
## [61] 0.073982727 0.352554634 0.968094012 0.045725516 0.224053660
## [66] 0.581710632 0.883185527 0.975643002 0.317584895 0.624627943
## [71] 0.836190385 0.753126928 0.893612862 0.115556712 0.982768412
## [76] 0.613779143 0.390456283 0.272880769 0.673438449 0.759262533
## [81] 0.282099595 0.354691449 0.837944803 0.656699881 0.937186390
## [86] 0.575408118 0.625706452 0.973709854 0.274309061 0.658811095
## [91] 0.894351837 0.003972315 0.100443042 0.822662528 0.467721760
## [96] 0.460471075 0.727418397 0.017716288 0.941029327 0.560349860
# Extra challenge: no anonymous function needed -- "[[" extracts element 3
# of each htest object, which is the p.value component for t.test results.
sapply(trials, "[[", 3)
## [1] 0.864070020 0.105039587 0.190862925 0.056823493 0.406571240
## [6] 0.040538612 0.511503629 0.132517699 0.178841071 0.217510693
## [11] 0.761834379 0.858064206 0.057840963 0.355723094 0.653715063
## [16] 0.814481575 0.724040362 0.421870128 0.827385142 0.837643001
## [21] 0.473042063 0.621388894 0.959622063 0.307964515 0.304054985
## [26] 0.321495808 0.579810263 0.696476954 0.409212929 0.601596926
## [31] 0.096394818 0.190862925 0.455087758 0.608012345 0.908282897
## [36] 0.229295198 0.415917133 0.857016511 0.294107747 0.686243306
## [41] 0.678054866 0.707680859 0.840619960 0.063519839 0.550409294
## [46] 0.122005290 0.799307503 0.537074500 0.569415696 0.256213232
## [51] 0.348723849 0.089359850 0.811364014 0.076449530 0.802230991
## [56] 0.551710147 0.819112954 0.589525889 0.050932861 0.504847216
## [61] 0.073982727 0.352554634 0.968094012 0.045725516 0.224053660
## [66] 0.581710632 0.883185527 0.975643002 0.317584895 0.624627943
## [71] 0.836190385 0.753126928 0.893612862 0.115556712 0.982768412
## [76] 0.613779143 0.390456283 0.272880769 0.673438449 0.759262533
## [81] 0.282099595 0.354691449 0.837944803 0.656699881 0.937186390
## [86] 0.575408118 0.625706452 0.973709854 0.274309061 0.658811095
## [91] 0.894351837 0.003972315 0.100443042 0.822662528 0.467721760
## [96] 0.460471075 0.727418397 0.017716288 0.941029327 0.560349860
library(parallel)
# mcsapply(): a parallel analogue of sapply() built on a PSOCK cluster.
#   k: vector/list to iterate over; f: function applied to each element.
# Returns unlist() of the parallel lapply results, mirroring sapply's
# default simplification for scalar results. Cluster size comes from
# getOption("cl.cores", 4).
mcsapply <- function(k, f) {
  cl <- makeCluster(getOption("cl.cores", 4))
  # FIX: reclaim the worker processes even if f throws an error; the
  # original called stopCluster() only on the success path, leaking the
  # cluster on failure.
  on.exit(stopCluster(cl), add = TRUE)
  result <- parLapply(cl, k, f)
  unlist(result)
}
# Timing comparison: extract 5000 p-values with the parallel mcsapply()
# versus plain sapply().
trials <- replicate(
5000,
t.test(rpois(12, 22), rpois(4, 34)),
simplify = FALSE
)
system.time(mcsapply(trials,function(x) unlist(x)[3]))
## user system elapsed
## 0.047 0.011 0.840
system.time(sapply(trials,function(x) unlist(x)[3]))
## user system elapsed
## 0.09 0.00 0.09
# NOTE(review): in the recorded run sapply's elapsed time (0.09 s) is
# actually lower than mcsapply's (0.84 s) -- cluster startup and data
# transfer dominate for a task this cheap; parallelism pays off only for
# heavier per-element work.
Reason: vapply() must validate each result against a pre-declared type/length template and returns an atomic vector rather than a list, so it cannot be obtained by simply wrapping mclapply() (which returns an untyped list); hence a parallel mcvapply() is not straightforward to implement.
set.seed(42)
# Random-walk Metropolis sampler (pure R) for the standard Laplace target;
# the acceptance ratio simplifies to exp(|x_old| - |y|). Returns the chain
# `x` and the rejection count `k`.
rw_MetropolisR <- function(sigma, x0, N)
{
  chain <- numeric(N)
  chain[1] <- x0
  unif <- runif(N)   # acceptance uniforms drawn up front (same RNG order)
  rejections <- 0
  for (step in 2:N) {
    cand <- rnorm(1, chain[step-1], sigma)
    if (unif[step] <= exp(-(abs(cand) - abs(chain[step-1])))) {
      chain[step] <- cand
    } else {
      chain[step] <- chain[step-1]
      rejections <- rejections + 1
    }
  }
  list(x=chain, k=rejections)
}
library(Rcpp)
# C++ version of the same random-walk Metropolis sampler, compiled inline
# with Rcpp::cppFunction. The source string below must stay exactly as-is.
# NOTE(review): the C++ function returns only the chain x; the rejection
# counter k is computed but never returned, unlike the R version which
# returns list(x, k) -- confirm whether k is needed downstream.
#// This is the rw_MetropolisC.cpp
#include <Rcpp.h>
#using namespace Rcpp;
#// [[Rcpp::export]]
cppFunction('NumericVector rw_MetropolisC(double sigma, double x0, int N)
{
//Metropolis Randomwalk using C
NumericVector x(N);
x[0] = x0;
double u, y;
int k = 0;
for (int i = 1; i < N; i++)
{
y = rnorm(1, x[i-1], sigma)[0];
u = runif(1)[0];
if (u <= exp(-(abs(y) - abs(x[i-1]))))
{
x[i] = y;
}
else
{
x[i] = x[i-1];
k++;
}
}
return x;
}')
library(microbenchmark)
# For each proposal sigma: benchmark the R vs C implementations, then draw
# QQ plots of each chain against the theoretical Laplace quantiles and
# against one another.
N = 2000
sigma <- c(0.5,1,10,100)
x0 = 25
for (i in 1:length(sigma)) {
# Timing comparison (median etc. printed below the loop output).
ts = microbenchmark(rwR = rw_MetropolisR(sigma[i], x0, N)$x,
rwC = rw_MetropolisC(sigma[i], x0, N))
print(summary(ts)[, c(1,3,5,6)])
rwR = rw_MetropolisR(sigma[i], x0, N)$x
rwC = rw_MetropolisC(sigma[i], x0, N)
par(mar=c(1,1,1,1))
par(mfrow = c(2, 2))
b <- 1000 #discard the burnin sample
# NOTE(review): y is computed from the post-burn-in sample but the
# quantiles below are taken from the full chains rwR/rwC -- confirm which
# was intended.
y <- (rwR)[b:N]
a <- ppoints(500)
QR <- ifelse(a <= 1/2, log(2*a), -log(2-2*a)) #quantiles of Laplace
Q1 <- quantile(rwR, a)
qqplot(QR, Q1, main=paste("R sigma=",sigma[i]) , xlab="Laplace Quantiles", ylab="Sample Quantiles", col = "red")
abline(a=0, b=1)
y <- (rwC)[b:N]
a <- ppoints(500)
QR <- ifelse(a <= 1/2, log(2*a), -log(2-2*a)) #quantiles of Laplace
Q2 <- quantile(rwC, a)
qqplot(QR, Q2, main=paste("C sigma=",sigma[i]) , xlab="Laplace Quantiles", ylab="Sample Quantiles", col = "red")
abline(a=0, b=1)
# Direct sample-vs-sample comparison of the two implementations.
qqplot(Q1, Q2, main=paste("C-R sigma=",sigma[i]) , xlab="C Quantiles", ylab="R Quantiles", col = "blue")
abline(a=0, b=1)
}
## expr lq median uq
## 1 rwR 3470.742 3564.614 3860.547
## 2 rwC 265.921 272.559 284.520
## expr lq median uq
## 1 rwR 3480.555 3584.0380 3784.814
## 2 rwC 268.972 274.8655 283.367
## expr lq median uq
## 1 rwR 3499.8935 3574.7095 3702.9955
## 2 rwC 259.4555 264.1225 273.5515
## expr lq median uq
## 1 rwR 3527.4415 3638.733 3864.512
## 2 rwC 263.1895 270.351 278.190
The random numbers generated by the two implementations fluctuate consistently — that is, they are essentially drawn from the same distribution family — and the C implementation's runtime efficiency is significantly higher than R's, so the C version is preferable.